!--------------------------------------------------------------------------------------------------!
! Copyright (C) by the DBCSR developers group - All rights reserved                                !
! This file is part of the DBCSR library.                                                          !
!                                                                                                  !
! For information on the license, see the LICENSE file.                                            !
! For further information please visit https://dbcsr.cp2k.org                                      !
! SPDX-License-Identifier: GPL-2.0+                                                                !
!--------------------------------------------------------------------------------------------------!

MODULE dbcsr_work_operations
   !! DBCSR work matrix utilities

   USE dbcsr_array_types, ONLY: array_data, &
                                array_hold, &
                                array_i1d_obj, &
                                array_new, &
                                array_nullify, &
                                array_release, &
                                array_size
   USE dbcsr_btree, ONLY: btree_data_cp2d, &
                          btree_data_dp2d, &
                          btree_data_sp2d, &
                          btree_data_zp2d
   USE dbcsr_block_operations, ONLY: block_copy_c, &
                                     block_copy_d, &
                                     block_copy_s, &
                                     block_copy_z, &
                                     dbcsr_data_copy, &
                                     dbcsr_data_set
   USE dbcsr_config, ONLY: default_resize_factor
   USE dbcsr_data_methods, ONLY: &
      dbcsr_data_clear_pointer, dbcsr_data_ensure_size, dbcsr_data_get_memory_type, &
      dbcsr_data_get_size, dbcsr_data_get_size_referenced, dbcsr_data_get_type, dbcsr_data_hold, &
      dbcsr_data_init, dbcsr_data_new, dbcsr_data_release, dbcsr_data_set_size_referenced, &
      dbcsr_data_valid, dbcsr_get_data_p_c, dbcsr_get_data_p_d, dbcsr_get_data_p_s, &
      dbcsr_get_data_p_z
   USE dbcsr_data_operations, ONLY: dbcsr_data_copyall, &
                                    dbcsr_sort_data, &
                                    dbcsr_switch_data_area
   USE dbcsr_dist_methods, ONLY: dbcsr_distribution_has_threads, &
                                 dbcsr_distribution_hold, &
                                 dbcsr_distribution_make_threads, &
                                 dbcsr_distribution_ncols, &
                                 dbcsr_distribution_nrows, &
                                 dbcsr_distribution_release
   USE dbcsr_index_operations, ONLY: dbcsr_addto_index_array, &
                                     dbcsr_build_row_index, &
                                     dbcsr_clearfrom_index_array, &
                                     dbcsr_index_prune_deleted, &
                                     dbcsr_make_dbcsr_index, &
                                     dbcsr_make_index_exist, &
                                     dbcsr_repoint_index, &
                                     dbcsr_sort_indices
   USE dbcsr_iterator_operations, ONLY: dbcsr_iterator_blocks_left, &
                                        dbcsr_iterator_next_block, &
                                        dbcsr_iterator_start, &
                                        dbcsr_iterator_stop
   USE dbcsr_mem_methods, ONLY: dbcsr_memtype_equal
   USE dbcsr_methods, ONLY: &
      dbcsr_col_block_sizes, dbcsr_distribution, dbcsr_get_data_memory_type, &
      dbcsr_get_data_size_used, dbcsr_get_data_type, dbcsr_get_index_memory_type, &
      dbcsr_get_matrix_type, dbcsr_get_replication_type, dbcsr_matrix_counter, &
      dbcsr_max_col_size, dbcsr_max_row_size, dbcsr_mutable_destroy, dbcsr_mutable_init, &
      dbcsr_mutable_instantiated, dbcsr_mutable_new, dbcsr_mutable_release, dbcsr_name, &
      dbcsr_row_block_sizes, dbcsr_use_mutable, dbcsr_valid_index, dbcsr_wm_use_mutable
   USE dbcsr_ptr_util, ONLY: ensure_array_size
   USE dbcsr_toollib, ONLY: dbcsr_unpack_i8_2i4, &
                            sort
   USE dbcsr_string_utilities, ONLY: uppercase
   USE dbcsr_types, ONLY: &
      dbcsr_data_obj, dbcsr_distribution_obj, dbcsr_iterator, dbcsr_memtype_default, &
      dbcsr_memtype_type, dbcsr_meta_size, dbcsr_num_slots, dbcsr_repl_col, dbcsr_repl_full, &
      dbcsr_repl_none, dbcsr_repl_row, dbcsr_slot_blk_p, dbcsr_slot_col_i, dbcsr_slot_home_coli, &
      dbcsr_slot_home_pcol, dbcsr_slot_home_prow, dbcsr_slot_home_rowi, dbcsr_slot_home_vpcol, &
      dbcsr_slot_home_vprow, dbcsr_slot_nblkcols_local, dbcsr_slot_nblkcols_total, &
      dbcsr_slot_nblkrows_local, dbcsr_slot_nblkrows_total, dbcsr_slot_nblks, &
      dbcsr_slot_nfullcols_local, dbcsr_slot_nfullcols_total, dbcsr_slot_nfullrows_local, &
      dbcsr_slot_nfullrows_total, dbcsr_slot_nze, dbcsr_slot_row_p, dbcsr_slot_size, dbcsr_type, &
      dbcsr_type_antihermitian, dbcsr_type_antisymmetric, dbcsr_type_complex_4, &
      dbcsr_type_complex_8, dbcsr_type_hermitian, dbcsr_type_no_symmetry, dbcsr_type_real_4, &
      dbcsr_type_real_8, dbcsr_type_real_default, dbcsr_type_symmetric, dbcsr_work_type
   USE dbcsr_dist_util, ONLY: convert_sizes_to_offsets, &
                              dbcsr_calc_block_sizes, &
                              dbcsr_verify_matrix, &
                              meta_from_dist
   USE dbcsr_kinds, ONLY: default_string_length, &
                          int_8, &
                          real_4, &
                          real_8
#include "base/dbcsr_base_uses.f90"

!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, omp_in_parallel

   IMPLICIT NONE
   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_work_operations'

   LOGICAL, PARAMETER :: careful_mod = .FALSE.

   PUBLIC :: dbcsr_create, dbcsr_work_create, dbcsr_finalize, dbcsr_special_finalize
   PUBLIC :: dbcsr_add_wm_from_matrix, &
             add_work_coordinate
   PUBLIC :: dbcsr_work_destroy

   INTERFACE dbcsr_create
      MODULE PROCEDURE dbcsr_create_new, dbcsr_create_template
   END INTERFACE

   TYPE i_array_p
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: p => NULL()
   END TYPE i_array_p

CONTAINS

   SUBROUTINE dbcsr_create_new(matrix, name, dist, matrix_type, &
                               row_blk_size, col_blk_size, row_blk_size_obj, col_blk_size_obj, &
                               nze, data_type, data_buffer, &
                               data_memory_type, index_memory_type, &
                               max_rbs, max_cbs, &
                               row_blk_offset, col_blk_offset, &
                               thread_dist, &
                               reuse, reuse_arrays, mutable_work, make_index, replication_type)
      !! Creates a matrix, allocating the essentials.
      !!
      !! The matrix itself is allocated, as well as the essential parts of
      !! the index. When passed the nze argument, the data is also allocated
      !! to that size.
      !! see dbcsr_types.F

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! new matrix
      CHARACTER(len=*), INTENT(IN)                       :: name
      TYPE(dbcsr_distribution_obj), INTENT(IN)           :: dist
         !! distribution_2d distribution
      CHARACTER, INTENT(IN)                              :: matrix_type
         !! 'N' for normal, 'T' for transposed, 'S' for symmetric, and 'A' for antisymmetric
      INTEGER, DIMENSION(:), INTENT(INOUT), POINTER, &
         CONTIGUOUS, OPTIONAL                            :: row_blk_size, col_blk_size
      TYPE(array_i1d_obj), INTENT(IN), OPTIONAL          :: row_blk_size_obj, col_blk_size_obj
      INTEGER, INTENT(IN), OPTIONAL                      :: nze, data_type
         !! number of elements
         !! type of data from 'rRcC' for single/double precision real/complex, default is 'R'
      TYPE(dbcsr_data_obj), INTENT(IN), OPTIONAL         :: data_buffer
      TYPE(dbcsr_memtype_type), INTENT(IN), OPTIONAL     :: data_memory_type, index_memory_type
         !! allocate indices and data using special memory
         !! allocate indices using special memory
      INTEGER, INTENT(IN), OPTIONAL                      :: max_rbs, max_cbs
      TYPE(array_i1d_obj), INTENT(IN), OPTIONAL          :: row_blk_offset, col_blk_offset
      TYPE(dbcsr_distribution_obj), INTENT(IN), OPTIONAL :: thread_dist
      LOGICAL, INTENT(IN), OPTIONAL                      :: reuse, reuse_arrays, mutable_work, &
                                                            make_index
         !! reuses an existing matrix, default is to create a fresh one
         !! uses the mutable data for working and not the append-only data; default is append-only
      CHARACTER, INTENT(IN), OPTIONAL                    :: replication_type
         !! replication to be used for this matrix; default is dbcsr_repl_none

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_create_new'

      CHARACTER                                          :: matrix_type_l
      INTEGER                                            :: handle, my_nze
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: vec_col_blk_offset, vec_row_blk_offset
      INTEGER, DIMENSION(dbcsr_meta_size)                :: new_meta
      LOGICAL                                            :: hijack, my_make_index

!   ---------------------------------------------------------------------------

      MARK_USED(thread_dist) ! only used with OMP

      CALL timeset(routineN, handle)

      ! Reuse matrix only if has actually been allocated.
      hijack = ASSOCIATED(matrix%index)
      IF (PRESENT(reuse)) hijack = reuse

      my_make_index = .TRUE.
      IF (PRESENT(make_index)) my_make_index = make_index

      IF (.NOT. hijack) THEN
         matrix = dbcsr_type()
         matrix%refcount = 1
      END IF
!$OMP     CRITICAL (crit_counter)
      matrix%serial_number = dbcsr_matrix_counter
      dbcsr_matrix_counter = dbcsr_matrix_counter + 1
!$OMP     END CRITICAL (crit_counter)
      ! Mark matrix index as having an invalid index.
      matrix%valid = .FALSE.
      matrix%name = name
      ! Sets the type of matrix building/modifying work structures.
      IF (PRESENT(mutable_work)) THEN
         matrix%work_mutable = mutable_work
      ELSE
         matrix%work_mutable = .FALSE.
      END IF
      ! Sets the correct data type.
      IF (PRESENT(data_type)) THEN
         SELECT CASE (data_type)
         CASE (dbcsr_type_real_4)
            matrix%data_type = dbcsr_type_real_4
         CASE (dbcsr_type_real_8)
            matrix%data_type = dbcsr_type_real_8
         CASE (dbcsr_type_complex_4)
            matrix%data_type = dbcsr_type_complex_4
         CASE (dbcsr_type_complex_8)
            matrix%data_type = dbcsr_type_complex_8
         CASE DEFAULT
            DBCSR_ABORT("Invalid matrix type")
         END SELECT
      ELSE
         matrix%data_type = dbcsr_type_real_default
      END IF

      matrix%data_memory_type = dbcsr_memtype_default
      IF (PRESENT(data_memory_type)) &
         matrix%data_memory_type = data_memory_type

      matrix%index_memory_type = dbcsr_memtype_default
      IF (PRESENT(index_memory_type)) &
         matrix%index_memory_type = index_memory_type

      IF (hijack) THEN
         ! Release/deallocate elements that are replaced or not needed
         ! by the new matrix. This is similar to what dbcsr_destroy
         ! does, except that it keeps the index and data.
         CALL array_release(matrix%row_blk_size)
         CALL array_release(matrix%col_blk_size)
         CALL array_release(matrix%row_blk_offset)
         CALL array_release(matrix%col_blk_offset)
         IF (matrix%has_local_rows) &
            CALL array_release(matrix%local_rows)
         IF (matrix%has_global_rows) &
            CALL array_release(matrix%global_rows)
         IF (matrix%has_local_cols) &
            CALL array_release(matrix%local_cols)
         IF (matrix%has_global_cols) &
            CALL array_release(matrix%global_cols)
         CALL dbcsr_distribution_release(matrix%dist)
         IF (ASSOCIATED(matrix%wms)) THEN
            CALL dbcsr_work_destroy_all(matrix)
         END IF
         CALL array_nullify(matrix%local_rows)
         CALL array_nullify(matrix%global_rows)
         CALL array_nullify(matrix%local_cols)
         CALL array_nullify(matrix%global_cols)
         !
         IF (matrix%data_type /= matrix%data_area%d%data_type) &
            DBCSR_ABORT("Inconsistent data type for the existing buffer.")
         CALL dbcsr_data_set_size_referenced(matrix%data_area, 0)
      ELSE
         ! Invalidate index
         NULLIFY (matrix%index)
         ! Invalidate data
         IF (PRESENT(data_buffer)) THEN
            IF (.NOT. dbcsr_data_valid(data_buffer)) &
               DBCSR_ABORT("Input data buffer not valid.")
            IF (matrix%data_type /= data_buffer%d%data_type) &
               DBCSR_ABORT("Input buffer data type different by matrix data type.")
            matrix%data_memory_type = data_buffer%d%memory_type
            matrix%data_area = data_buffer
            CALL dbcsr_data_hold(matrix%data_area)
         ELSE
            CALL dbcsr_data_init(matrix%data_area)
         END IF
      END IF
      ! These are always invalidated.
      NULLIFY (matrix%row_p, matrix%col_i, matrix%blk_p, matrix%thr_c, &
               matrix%coo_l)
      IF (PRESENT(row_blk_size_obj)) THEN
         matrix%row_blk_size = row_blk_size_obj
         CALL array_hold(matrix%row_blk_size)
      ELSEIF (PRESENT(row_blk_size)) THEN
         CALL array_new(matrix%row_blk_size, row_blk_size, gift=reuse_arrays)
      ELSE
         DBCSR_ABORT("Missing row_blk_size")
      END IF
      IF (PRESENT(max_rbs)) THEN
         matrix%max_rbs = max_rbs
      ELSE IF (array_size(matrix%row_blk_size) .GT. 0) THEN
         matrix%max_rbs = MAXVAL(array_data(matrix%row_blk_size))
      ELSE
         matrix%max_rbs = 0
      END IF
      IF (PRESENT(col_blk_size_obj)) THEN
         matrix%col_blk_size = col_blk_size_obj
         CALL array_hold(matrix%col_blk_size)
      ELSEIF (PRESENT(col_blk_size)) THEN
         CALL array_new(matrix%col_blk_size, col_blk_size, gift=reuse_arrays)
      ELSE
         DBCSR_ABORT("Missing col_blk_size")
      END IF
      IF (PRESENT(max_cbs)) THEN
         matrix%max_cbs = max_cbs
      ELSE IF (array_size(matrix%col_blk_size) .GT. 0) THEN
         matrix%max_cbs = MAXVAL(array_data(matrix%col_blk_size))
      ELSE
         matrix%max_cbs = 0
      END IF
      !
      IF (array_size(matrix%row_blk_size) /= dbcsr_distribution_nrows(dist)) &
         DBCSR_ABORT("Number of blocked rows does match blocked row distribution.")
      IF (array_size(matrix%col_blk_size) /= dbcsr_distribution_ncols(dist)) &
         DBCSR_ABORT("Number of blocked columns does match blocked column distribution.")
      ! initialize row/col offsets
      IF (PRESENT(row_blk_offset)) THEN
         IF (dbcsr_distribution_nrows(dist) + 1 /= array_size(row_blk_offset)) &
            CALL dbcsr_abort(__LOCATION__, &
                             "Number of blocked offset rows does match blocked row distribution.")
         matrix%row_blk_offset = row_blk_offset
         CALL array_hold(matrix%row_blk_offset)
      ELSE
         ALLOCATE (vec_row_blk_offset(array_size(matrix%row_blk_size) + 1))
         CALL convert_sizes_to_offsets(array_data(matrix%row_blk_size), vec_row_blk_offset)
         CALL array_new(matrix%row_blk_offset, vec_row_blk_offset, gift=.TRUE.)
      END IF

      IF (PRESENT(col_blk_offset)) THEN
         IF (dbcsr_distribution_ncols(dist) + 1 /= array_size(col_blk_offset)) &
            CALL dbcsr_abort(__LOCATION__, &
                             "Number of blocked offset columns does match blocked column distribution.")
         matrix%col_blk_offset = col_blk_offset
         CALL array_hold(matrix%col_blk_offset)
      ELSE
         ALLOCATE (vec_col_blk_offset(array_size(matrix%col_blk_size) + 1))
         CALL convert_sizes_to_offsets(array_data(matrix%col_blk_size), vec_col_blk_offset)
         CALL array_new(matrix%col_blk_offset, vec_col_blk_offset, gift=.TRUE.)
      END IF

      matrix%dist = dist
      CALL dbcsr_distribution_hold(matrix%dist)
!$    IF (.NOT. dbcsr_distribution_has_threads(matrix%dist) .AND. PRESENT(thread_dist)) THEN
!$       IF (dbcsr_distribution_has_threads(thread_dist)) THEN
!$          matrix%dist%d%num_threads = thread_dist%d%num_threads
!$          matrix%dist%d%has_thread_dist = .TRUE.
!$          matrix%dist%d%thread_dist = thread_dist%d%thread_dist
!$          CALL array_hold(matrix%dist%d%thread_dist)
!$       END IF
!$    END IF
!$    IF (.NOT. dbcsr_distribution_has_threads(matrix%dist)) THEN
!$       CALL dbcsr_distribution_make_threads(matrix%dist, &
!$                                            array_data(matrix%row_blk_size))
!$    END IF
      ! Set up some data.
      IF (my_make_index) THEN
         CALL meta_from_dist(new_meta, dist, array_data(matrix%row_blk_size), &
                             array_data(matrix%col_blk_size))
         matrix%nblkrows_total = new_meta(dbcsr_slot_nblkrows_total)
         matrix%nblkcols_total = new_meta(dbcsr_slot_nblkcols_total)
         matrix%nfullrows_total = new_meta(dbcsr_slot_nfullrows_total)
         matrix%nfullcols_total = new_meta(dbcsr_slot_nfullcols_total)
         matrix%nblkrows_local = new_meta(dbcsr_slot_nblkrows_local)
         matrix%nblkcols_local = new_meta(dbcsr_slot_nblkcols_local)
         matrix%nfullrows_local = new_meta(dbcsr_slot_nfullrows_local)
         matrix%nfullcols_local = new_meta(dbcsr_slot_nfullcols_local)
      END IF
      my_nze = 0; IF (PRESENT(nze)) my_nze = nze
      matrix%nblks = 0
      matrix%nze = 0

      IF (PRESENT(replication_type)) THEN
         IF (replication_type .NE. dbcsr_repl_none &
             .AND. replication_type .NE. dbcsr_repl_full &
             .AND. replication_type .NE. dbcsr_repl_row &
             .AND. replication_type .NE. dbcsr_repl_col) &
            DBCSR_ABORT("Invalid replication type '"//replication_type//"'")
         IF (replication_type .EQ. dbcsr_repl_row .OR. replication_type .EQ. dbcsr_repl_col) &
            DBCSR_WARN("Row and column replication not fully supported")
         matrix%replication_type = replication_type
      ELSE
         matrix%replication_type = dbcsr_repl_none
      END IF
      !
      ! Setup a matrix from scratch
      IF (.NOT. hijack) THEN
         IF (.NOT. PRESENT(data_buffer)) THEN
            CALL dbcsr_data_new(matrix%data_area, matrix%data_type, my_nze, &
                                memory_type=matrix%data_memory_type)
            CALL dbcsr_data_set_size_referenced(matrix%data_area, 0)
         END IF
         !
         IF (my_make_index) THEN
            NULLIFY (matrix%index)
            CALL ensure_array_size(matrix%index, lb=1, ub=dbcsr_num_slots, &
                                   zero_pad=.TRUE., memory_type=matrix%index_memory_type)
         END IF
      END IF
      IF (my_make_index) THEN
         IF (LBOUND(matrix%index, 1) .GT. 1 &
             .OR. UBOUND(matrix%index, 1) .LT. dbcsr_num_slots) &
            DBCSR_ABORT("Index is not large enough")
         matrix%index(1:dbcsr_num_slots) = 0
         matrix%index(1:dbcsr_meta_size) = new_meta(1:dbcsr_meta_size)
         matrix%index(dbcsr_slot_size) = dbcsr_num_slots
      END IF
      !
      matrix%symmetry = .FALSE.
      matrix%negate_real = .FALSE.
      matrix%negate_imaginary = .FALSE.
      !matrix%transpose = .FALSE.
      matrix_type_l = matrix_type
      CALL uppercase(matrix_type_l)
      SELECT CASE (matrix_type_l)
      CASE (dbcsr_type_no_symmetry)
      CASE (dbcsr_type_symmetric)
         matrix%symmetry = .TRUE.
      CASE (dbcsr_type_antisymmetric)
         matrix%symmetry = .TRUE.
         matrix%negate_real = .TRUE.
         matrix%negate_imaginary = .TRUE.
      CASE (dbcsr_type_hermitian)
         matrix%symmetry = .TRUE.
         matrix%negate_imaginary = .TRUE.
      CASE (dbcsr_type_antihermitian)
         matrix%symmetry = .TRUE.
         matrix%negate_real = .TRUE.
      CASE DEFAULT
         DBCSR_ABORT("Invalid matrix type.")
      END SELECT
      matrix%bcsc = .FALSE.
      matrix%local_indexing = .FALSE.
      matrix%list_indexing = .FALSE.
      CALL array_nullify(matrix%local_rows)
      CALL array_nullify(matrix%global_rows)
      CALL array_nullify(matrix%local_cols)
      CALL array_nullify(matrix%global_cols)
      matrix%has_local_rows = .FALSE.
      matrix%has_global_rows = .FALSE.
      matrix%has_local_cols = .FALSE.
      matrix%has_global_cols = .FALSE.
      IF (my_make_index) THEN
         CALL dbcsr_make_index_exist(matrix)
      END IF
      matrix%valid = .TRUE.
      CALL timestop(handle)
   END SUBROUTINE dbcsr_create_new

   SUBROUTINE dbcsr_create_template(matrix, template, name, dist, matrix_type, &
                                    row_blk_size, col_blk_size, row_blk_size_obj, col_blk_size_obj, &
                                    nze, data_type, &
                                    data_buffer, data_memory_type, index_memory_type, &
                                    max_rbs, max_cbs, &
                                    row_blk_offset, col_blk_offset, &
                                    reuse_arrays, mutable_work, make_index, replication_type)
      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
      TYPE(dbcsr_type), INTENT(IN)                       :: template
      CHARACTER(len=*), INTENT(IN), OPTIONAL             :: name
      TYPE(dbcsr_distribution_obj), INTENT(IN), OPTIONAL :: dist
      CHARACTER, INTENT(IN), OPTIONAL                    :: matrix_type
      INTEGER, DIMENSION(:), INTENT(INOUT), OPTIONAL, &
         POINTER, CONTIGUOUS                             :: row_blk_size, col_blk_size
      TYPE(array_i1d_obj), INTENT(IN), OPTIONAL          :: row_blk_size_obj, col_blk_size_obj
      INTEGER, INTENT(IN), OPTIONAL                      :: nze, data_type
      TYPE(dbcsr_data_obj), INTENT(IN), OPTIONAL         :: data_buffer
      TYPE(dbcsr_memtype_type), INTENT(IN), OPTIONAL     :: data_memory_type, index_memory_type
      INTEGER, INTENT(IN), OPTIONAL                      :: max_rbs, max_cbs
      TYPE(array_i1d_obj), INTENT(IN), OPTIONAL          :: row_blk_offset, col_blk_offset
      LOGICAL, INTENT(IN), OPTIONAL                      :: reuse_arrays, mutable_work, make_index
      CHARACTER, INTENT(IN), OPTIONAL                    :: replication_type

      CHARACTER                                          :: new_matrix_type, new_replication_type
      CHARACTER(len=default_string_length)               :: new_name
      INTEGER                                            :: new_data_type, new_max_cbs, new_max_rbs
      LOGICAL                                            :: my_make_index, new_mutable_work
      TYPE(array_i1d_obj)                                :: new_col_blk_offset, new_row_blk_offset
      TYPE(dbcsr_distribution_obj)                       :: new_dist
      TYPE(dbcsr_memtype_type)                           :: new_data_memory_type, &
                                                            new_index_memory_type

      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: blk_size

!   ---------------------------------------------------------------------------

      IF (PRESENT(name)) THEN
         new_name = TRIM(name)
      ELSE
         new_name = TRIM(dbcsr_name(template))
      END IF
      IF (PRESENT(dist)) THEN
         new_dist = dist
      ELSE
         new_dist = dbcsr_distribution(template)
      END IF
      IF (PRESENT(matrix_type)) THEN
         new_matrix_type = matrix_type
      ELSE
         new_matrix_type = dbcsr_get_matrix_type(template)
      END IF
      !
      IF ((PRESENT(row_blk_size) .NEQV. PRESENT(col_blk_size)) .OR. &
          (PRESENT(row_blk_size_obj) .NEQV. PRESENT(col_blk_size_obj))) THEN
         DBCSR_ABORT("Both row_blk_size and col_blk_size must be provided")
      END IF
      !
      IF (PRESENT(max_rbs)) new_max_rbs = max_rbs
      IF (PRESENT(row_blk_offset)) new_row_blk_offset = row_blk_offset
      NULLIFY (blk_size)
      IF (PRESENT(row_blk_size_obj)) THEN
         blk_size => array_data(row_blk_size_obj)
      ELSEIF (PRESENT(row_blk_size)) THEN
         blk_size => row_blk_size
      END IF
      IF (ASSOCIATED(blk_size)) THEN
         IF (.NOT. PRESENT(max_rbs)) &
            new_max_rbs = MAXVAL(blk_size)
      ELSE
         IF (.NOT. PRESENT(max_rbs)) &
            new_max_rbs = dbcsr_max_row_size(template)
         IF (.NOT. PRESENT(row_blk_offset)) &
            new_row_blk_offset = template%row_blk_offset
      END IF
      !
      IF (PRESENT(max_cbs)) new_max_cbs = max_cbs
      IF (PRESENT(col_blk_offset)) new_col_blk_offset = col_blk_offset
      NULLIFY (blk_size)
      IF (PRESENT(col_blk_size_obj)) THEN
         blk_size => array_data(col_blk_size_obj)
      ELSEIF (PRESENT(col_blk_size)) THEN
         blk_size => col_blk_size
      END IF
      IF (ASSOCIATED(blk_size)) THEN
         IF (.NOT. PRESENT(max_cbs)) &
            new_max_cbs = MAXVAL(blk_size)
      ELSE
         IF (.NOT. PRESENT(max_cbs)) &
            new_max_cbs = dbcsr_max_col_size(template)
         IF (.NOT. PRESENT(col_blk_offset)) &
            new_col_blk_offset = template%col_blk_offset
      END IF
      IF (PRESENT(data_type)) THEN
         new_data_type = data_type
      ELSE
         new_data_type = dbcsr_get_data_type(template)
      END IF
      IF (PRESENT(data_memory_type)) THEN
         new_data_memory_type = data_memory_type
      ELSE
         new_data_memory_type = dbcsr_get_data_memory_type(template)
      END IF
      IF (PRESENT(index_memory_type)) THEN
         new_index_memory_type = index_memory_type
      ELSE
         new_index_memory_type = dbcsr_get_index_memory_type(template)
      END IF
      IF (PRESENT(replication_type)) THEN
         new_replication_type = replication_type
      ELSE
         new_replication_type = dbcsr_get_replication_type(template)
      END IF
      IF (PRESENT(mutable_work)) THEN
         new_mutable_work = mutable_work
      ELSE
         new_mutable_work = dbcsr_use_mutable(template)
      END IF
      IF (PRESENT(row_blk_size_obj)) THEN
         CALL dbcsr_create(matrix, name=new_name, dist=new_dist, &
                           matrix_type=new_matrix_type, &
                           row_blk_size_obj=row_blk_size_obj, &
                           col_blk_size_obj=col_blk_size_obj, &
                           nze=nze, &
                           data_type=new_data_type, &
                           data_buffer=data_buffer, &
                           data_memory_type=new_data_memory_type, &
                           index_memory_type=new_index_memory_type, &
                           max_rbs=new_max_rbs, max_cbs=new_max_cbs, &
                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset, &
                           reuse_arrays=reuse_arrays, &
                           mutable_work=new_mutable_work, &
                           make_index=make_index, &
                           replication_type=new_replication_type)
      ELSEIF (PRESENT(row_blk_size)) THEN
         CALL dbcsr_create(matrix, name=new_name, dist=new_dist, &
                           matrix_type=new_matrix_type, &
                           row_blk_size=row_blk_size, &
                           col_blk_size=col_blk_size, &
                           nze=nze, &
                           data_type=new_data_type, &
                           data_buffer=data_buffer, &
                           data_memory_type=new_data_memory_type, &
                           index_memory_type=new_index_memory_type, &
                           max_rbs=new_max_rbs, max_cbs=new_max_cbs, &
                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset, &
                           reuse_arrays=reuse_arrays, &
                           mutable_work=new_mutable_work, &
                           make_index=make_index, &
                           replication_type=new_replication_type)
      ELSE
         CALL dbcsr_create(matrix, name=new_name, dist=new_dist, &
                           matrix_type=new_matrix_type, &
                           row_blk_size_obj=template%row_blk_size, &
                           col_blk_size_obj=template%col_blk_size, &
                           nze=nze, &
                           data_type=new_data_type, &
                           data_buffer=data_buffer, &
                           data_memory_type=new_data_memory_type, &
                           index_memory_type=new_index_memory_type, &
                           max_rbs=new_max_rbs, max_cbs=new_max_cbs, &
                           row_blk_offset=new_row_blk_offset, col_blk_offset=new_col_blk_offset, &
                           thread_dist=dbcsr_distribution(template), &
                           reuse_arrays=reuse_arrays, &
                           mutable_work=new_mutable_work, &
                           make_index=make_index, &
                           replication_type=new_replication_type)
      END IF
      ! Copy stuff from the meta-array.  These are not normally needed,
      ! but have to be here for creating matrices from "image" matrices.
      my_make_index = .TRUE.
      IF (PRESENT(make_index)) my_make_index = make_index
      IF (my_make_index) THEN
         matrix%index(dbcsr_slot_home_prow) = template%index(dbcsr_slot_home_prow)
         matrix%index(dbcsr_slot_home_rowi) = template%index(dbcsr_slot_home_rowi)
         matrix%index(dbcsr_slot_home_pcol) = template%index(dbcsr_slot_home_pcol)
         matrix%index(dbcsr_slot_home_coli) = template%index(dbcsr_slot_home_coli)
         matrix%index(dbcsr_slot_home_vprow) = template%index(dbcsr_slot_home_vprow)
         matrix%index(dbcsr_slot_home_vpcol) = template%index(dbcsr_slot_home_vpcol)
      END IF
      IF (PRESENT(row_blk_size) .AND. .NOT. PRESENT(row_blk_offset)) THEN
         CALL array_release(new_row_blk_offset)
      END IF
      IF (PRESENT(col_blk_size) .AND. .NOT. PRESENT(col_blk_offset)) THEN
         CALL array_release(new_col_blk_offset)
      END IF

   END SUBROUTINE dbcsr_create_template

   SUBROUTINE dbcsr_init_wm(wm, data_type, nblks_guess, sizedata_guess, memory_type)
      !! Initializes one work matrix

      TYPE(dbcsr_work_type), INTENT(OUT)                 :: wm
         !! initialized work matrix
      INTEGER, INTENT(IN)                                :: data_type
      INTEGER, INTENT(IN), OPTIONAL                      :: nblks_guess, sizedata_guess
         !! estimated number of blocks
         !! estimated size of data
      TYPE(dbcsr_memtype_type), INTENT(IN), OPTIONAL     :: memory_type

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_init_wm'
      INTEGER                                            :: handle, nblks, stat

!   ---------------------------------------------------------------------------

      IF (careful_mod) CALL timeset(routineN, handle)
      wm%lastblk = 0
      wm%datasize = 0
      ! Index
      IF (PRESENT(nblks_guess)) THEN
         nblks = nblks_guess
         IF (nblks_guess < 0) &
            DBCSR_ABORT("Can not have negative block count.")
         ALLOCATE (wm%row_i(nblks), stat=stat)
         IF (stat /= 0) DBCSR_ABORT("wm%row_i")
         ALLOCATE (wm%col_i(nblks), stat=stat)
         IF (stat /= 0) DBCSR_ABORT("wm%col_i")
         ALLOCATE (wm%blk_p(nblks), stat=stat)
         IF (stat /= 0) DBCSR_ABORT("wm%blk_p")
      ELSE
         NULLIFY (wm%row_i, wm%col_i, wm%blk_p)
         !nblks = CEILING (REAL (matrix%nblkrows_local * matrix%nblkcols_local)&
         !     / REAL (dbcsr_mp_numnodes (dbcsr_distribution_mp (matrix%dist))))
      END IF
      ! Data
      CALL dbcsr_data_init(wm%data_area)
      IF (PRESENT(sizedata_guess)) THEN
         IF (sizedata_guess < 0) &
            DBCSR_ABORT("Can not have negative data size.")
         CALL dbcsr_data_new(wm%data_area, data_type, &
                             data_size=sizedata_guess, memory_type=memory_type)
      ELSE
         CALL dbcsr_data_new(wm%data_area, data_type, memory_type=memory_type)
      END IF
      CALL dbcsr_mutable_init(wm%mutable)
      IF (careful_mod) CALL timestop(handle)
   END SUBROUTINE dbcsr_init_wm

   SUBROUTINE dbcsr_work_create(matrix, nblks_guess, sizedata_guess, n, &
                                work_mutable, memory_type)
      !! Creates a the working matrix(es) for a DBCSR matrix.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! new matrix
      INTEGER, INTENT(IN), OPTIONAL                      :: nblks_guess, sizedata_guess, n
         !! estimated number of blocks
         !! estimated size of data
         !! number work matrices to create, default is 1
      LOGICAL, INTENT(in), OPTIONAL                      :: work_mutable
         !! use mutable work type, default is what was specified in create
      TYPE(dbcsr_memtype_type), INTENT(IN), OPTIONAL     :: memory_type

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_work_create'

      INTEGER                                            :: handle, iw, nw, ow
      LOGICAL                                            :: wms_new, wms_realloc
      TYPE(dbcsr_work_type), DIMENSION(:), POINTER       :: wms

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      IF (PRESENT(n)) THEN
         nw = n
      ELSE
         nw = 1
!$       IF (omp_in_parallel()) THEN
!$          nw = omp_get_num_threads()
!$       ELSE
!$          nw = omp_get_max_threads()
!$       END IF
      END IF
!$OMP     MASTER
      wms_new = .NOT. ASSOCIATED(matrix%wms)
      wms_realloc = .FALSE.
      IF (ASSOCIATED(matrix%wms)) THEN
         ow = SIZE(matrix%wms)
         IF (ow .LT. nw) &
            DBCSR_WARN("Number of work matrices less than threads.")
         IF (ow .LT. nw) wms_realloc = .TRUE.
      END IF
      IF (PRESENT(work_mutable)) THEN
         matrix%work_mutable = work_mutable
      END IF
      IF (wms_realloc) THEN
         ALLOCATE (wms(nw))
         wms(1:ow) = matrix%wms(1:ow)
         DEALLOCATE (matrix%wms)
         matrix%wms => wms
         DO iw = ow + 1, nw
            CALL dbcsr_init_wm(matrix%wms(iw), matrix%data_type, &
                               nblks_guess=nblks_guess, sizedata_guess=sizedata_guess, &
                               memory_type=memory_type)
            IF (matrix%work_mutable) &
               CALL dbcsr_mutable_new(matrix%wms(iw)%mutable, &
                                      dbcsr_get_data_type(matrix))
         END DO
      END IF
      IF (wms_new) THEN
         ALLOCATE (matrix%wms(nw))
         DO iw = 1, nw
            CALL dbcsr_init_wm(matrix%wms(iw), matrix%data_type, &
                               nblks_guess=nblks_guess, sizedata_guess=sizedata_guess, &
                               memory_type=memory_type)
            IF (matrix%work_mutable) &
               CALL dbcsr_mutable_new(matrix%wms(iw)%mutable, &
                                      dbcsr_get_data_type(matrix))
         END DO
      END IF
      matrix%valid = .FALSE.
!$OMP     END MASTER
      CALL timestop(handle)
   END SUBROUTINE dbcsr_work_create

   SUBROUTINE dbcsr_finalize(matrix, reshuffle)
      !! Creates the final dbcsr_type matrix from the working matrix.
      !! Work matrices (array or tree-based) are merged into the base DBCSR matrix.
      !! If a matrix is marked as having a valid index, then nothing is done.
      !! Deleted blocks are pruned from the index.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! final matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: reshuffle
         !! whether the data should be reshuffled, default is false

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_finalize'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, i, nblks, nwms, start_offset
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: empty_row_p
      INTEGER, DIMENSION(:), POINTER, SAVE               :: old_blk_p, old_col_i, old_row_p
      LOGICAL                                            :: can_quick, fake_row_p, sort_data, spawn

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

!$OMP     MASTER
      NULLIFY (old_blk_p, old_col_i, old_row_p)
!$OMP     END MASTER

!$OMP     BARRIER
      ! If the matrix is not marked as dirty then skip the work.
      IF (dbcsr_valid_index(matrix)) THEN
         !"No need to finalize a valid matrix, skipping."
         !
         ! A matrix with a valid index should not have associated work
         ! arrays.  This may happen when this routine is called on a
         ! matrix that was not changed.
!$OMP        BARRIER
!$OMP        MASTER
         IF (ASSOCIATED(matrix%wms)) &
            CALL dbcsr_work_destroy_all(matrix)
         matrix%valid = .TRUE.
!$OMP        END MASTER
!$OMP        BARRIER
         CALL timestop(handle)
         RETURN
      END IF
      !
      ! If possible, data copying is avoided.
      IF (PRESENT(reshuffle)) THEN
         sort_data = reshuffle
      ELSE
         sort_data = .FALSE.
      END IF
      !
      ! Now make sure that a valid row_p exists. Also clear the row_p if
      ! the matrix is declared to have 0 blocks.
!$OMP     MASTER
      fake_row_p = .NOT. ASSOCIATED(matrix%row_p)
      IF (ASSOCIATED(matrix%row_p)) THEN
         fake_row_p = SIZE(matrix%row_p) .LE. 1
      END IF
      fake_row_p = fake_row_p .OR. matrix%nblks .EQ. 0
!$OMP     END MASTER
      !
      ! See where data will be appended in the main data
      ! area. Alternatively, set to the start if the matrix is declared
      ! to have no data. (This value is ignored if reshuffle is true
      ! because the main data area is always new.)
      start_offset = matrix%nze
      i = dbcsr_get_data_size_used(matrix)
!$OMP     MASTER
      matrix%nze = 0
!$OMP     END MASTER
!$OMP     BARRIER
!$OMP     ATOMIC
      matrix%nze = matrix%nze + i
!$OMP     BARRIER
      IF (dbg) THEN
         WRITE (*, *) routineN//" sizes", matrix%nze, i, &
            dbcsr_data_get_size_referenced(matrix%data_area), &
            dbcsr_data_get_size(matrix%data_area)
      END IF
      IF (.FALSE. .AND. dbcsr_data_get_size_referenced(matrix%data_area) .NE. &
          matrix%nze) THEN
         IF (matrix%nze .NE. dbcsr_data_get_size_referenced(matrix%data_area)) &
            DBCSR_WARN("Should reshuffle.")
         IF (ASSOCIATED(matrix%wms)) THEN
            sort_data = .NOT. dbcsr_wm_use_mutable(matrix%wms(1))
         END IF
      END IF
      IF (sort_data .AND. matrix%nze .GT. 0) THEN
         CALL dbcsr_add_wm_from_matrix(matrix)
         matrix%nze = 0
!$OMP        MASTER
         fake_row_p = .TRUE.
!$OMP        END MASTER
      END IF
      start_offset = dbcsr_data_get_size_referenced(matrix%data_area) + 1
      IF (matrix%nze .EQ. 0) start_offset = 1
!$OMP     MASTER
      matrix%index(dbcsr_slot_nze) = matrix%nze
      IF (fake_row_p) THEN
         ALLOCATE (empty_row_p(matrix%nblkrows_total + 1))
         empty_row_p(:) = 0
         CALL dbcsr_addto_index_array(matrix, dbcsr_slot_row_p, &
                                      DATA=empty_row_p, extra=0)
         CALL dbcsr_addto_index_array(matrix, dbcsr_slot_col_i, &
                                      reservation=0)
         CALL dbcsr_addto_index_array(matrix, dbcsr_slot_blk_p, &
                                      reservation=0)
         CALL dbcsr_repoint_index(matrix)
      END IF
!$OMP     END MASTER
      !
!$OMP     BARRIER
      can_quick = can_quickly_finalize(matrix)
!$OMP     BARRIER
      ! If the matrix, work matrices, and environment fit several
      ! criteria, then a quick O(1) finalization is performed.
      IF (can_quick .AND. .NOT. sort_data) THEN
         CALL quick_finalize(matrix)
      ELSE
         !
!$OMP        MASTER
         !
         ! Create work matrices if not yet existing
         IF (.NOT. ASSOCIATED(matrix%wms)) THEN
            nwms = 1
!$          nwms = omp_get_num_threads()
            CALL dbcsr_work_create(matrix, n=nwms)
         END IF
!$OMP        END MASTER
!$OMP        BARRIER
         !
         ! Ensure index arrays at least exist.
!$OMP        DO SCHEDULE (STATIC, 1)
         DO i = 1, SIZE(matrix%wms)
            IF (.NOT. ASSOCIATED(matrix%wms(i)%row_i)) THEN
               CALL ensure_array_size(matrix%wms(i)%row_i, ub=0)
            END IF
            IF (.NOT. ASSOCIATED(matrix%wms(i)%col_i)) THEN
               CALL ensure_array_size(matrix%wms(i)%col_i, ub=0)
            END IF
            IF (.NOT. ASSOCIATED(matrix%wms(i)%blk_p)) THEN
               CALL ensure_array_size(matrix%wms(i)%blk_p, ub=0)
            END IF
         END DO
!$OMP        ENDDO
         !
         ! Check for deleted blocks
!$OMP        MASTER
         nblks = matrix%row_p(matrix%nblkrows_total + 1)
         IF (ANY(matrix%blk_p(1:nblks) .EQ. 0)) THEN
            CALL dbcsr_index_prune_deleted(matrix)
         END IF
         old_row_p => matrix%row_p
         old_col_i => matrix%col_i
         old_blk_p => matrix%blk_p
!$OMP        END MASTER
         !
!$OMP        BARRIER
         ! Check to see if we will need to create a parallel environment
         ! (needed when there are multiple work matrices but we are not
         ! in an OpenMP parallel section.)
         !
         ! A parallel section is created and used when the matrix has
         ! more work matrices. It's a shortcut when the finalize is
         ! called from a non-parallel environment whereas the matrix was
         ! built/modified in a parallel environment
         nwms = SIZE(matrix%wms)
         spawn = .FALSE.
!$       IF (.NOT. omp_in_parallel()) THEN
!$          IF (nwms .GT. 1) spawn = .TRUE.
!$       END IF
         IF (spawn) THEN
!$OMP           PARALLEL IF (spawn) &
!$OMP                    DEFAULT (NONE) &
!$OMP                    SHARED (matrix, old_row_p, old_col_i, old_blk_p,&
!$OMP                            start_offset, sort_data)
            CALL dbcsr_merge_all(matrix, &
                                 old_row_p, old_col_i, old_blk_p, &
                                 sort_data=sort_data)
!$OMP           END PARALLEL
         ELSE
            CALL dbcsr_merge_all(matrix, &
                                 old_row_p, old_col_i, old_blk_p, &
                                 sort_data=sort_data)
         END IF
      END IF
!$OMP BARRIER
!$OMP MASTER
      ! Clean up.
      IF (ASSOCIATED(matrix%wms)) THEN
         CALL dbcsr_work_destroy_all(matrix)
      END IF
      matrix%valid = .TRUE.
!$OMP END MASTER
!$OMP BARRIER
      IF (dbg) THEN
!$OMP        SINGLE
         CALL dbcsr_verify_matrix(matrix)
!$OMP        END SINGLE
      END IF
!$OMP MASTER
      IF (fake_row_p) THEN
         DEALLOCATE (empty_row_p)
      END IF
!$OMP END MASTER
!$OMP BARRIER
      CALL timestop(handle)
   END SUBROUTINE dbcsr_finalize

   SUBROUTINE dbcsr_special_finalize(matrix, reshuffle)
      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: reshuffle

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_special_finalize'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle
      LOGICAL                                            :: can_quick, sort_data

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      IF (matrix%nblks .NE. 0) &
         DBCSR_ABORT("Optimized finalize requires empty matrix.")
      IF (dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Optimized finalize requires invalid matrix.")
      IF (.NOT. ASSOCIATED(matrix%wms)) &
         DBCSR_ABORT("Optimized finalize requires work matrices exist.")
      IF (SIZE(matrix%wms) .NE. 1) &
         DBCSR_ABORT("Optimized finalize requires single work matrix")

      !
      ! If possible, data copying is avoided.
      IF (PRESENT(reshuffle)) THEN
         sort_data = reshuffle
      ELSE
         sort_data = .FALSE.
      END IF
!$OMP     BARRIER
      can_quick = can_quickly_finalize(matrix)
!$OMP     BARRIER

      ! If the matrix, work matrices, and environment fit several
      ! criteria, then a quick O(1) finalization is performed.
      IF (can_quick .AND. .NOT. sort_data) THEN
         CALL quick_finalize(matrix)
      ELSE
         IF (.NOT. sort_data) &
            DBCSR_ABORT("merge_single_wm only supports data sorting")
         !
         ! Ensure needed index arrays at least exist.
!$OMP        MASTER
         !
         IF (.NOT. ASSOCIATED(matrix%wms(1)%row_i)) THEN
            CALL ensure_array_size(matrix%wms(1)%row_i, ub=0)
         END IF
         IF (.NOT. ASSOCIATED(matrix%wms(1)%col_i)) THEN
            CALL ensure_array_size(matrix%wms(1)%col_i, ub=0)
         END IF
         IF (.NOT. ASSOCIATED(matrix%wms(1)%blk_p)) THEN
            CALL ensure_array_size(matrix%wms(1)%blk_p, ub=0)
         END IF
         !
!$OMP        END MASTER
!$OMP        BARRIER
         !
!$OMP        PARALLEL DEFAULT (NONE), SHARED(matrix)
         CALL dbcsr_merge_single_wm(matrix)
!$OMP        END PARALLEL

      END IF

!$OMP     MASTER
      ! Clean up.
      IF (ASSOCIATED(matrix%wms)) THEN
         CALL dbcsr_work_destroy_all(matrix)
      END IF
      matrix%valid = .TRUE.
!$OMP     END MASTER
!$OMP     BARRIER
      IF (dbg) THEN
!$OMP        SINGLE
         CALL dbcsr_verify_matrix(matrix)
!$OMP        END SINGLE
      END IF
!$OMP     BARRIER
      CALL timestop(handle)
   END SUBROUTINE dbcsr_special_finalize

   FUNCTION can_quickly_finalize(matrix) RESULT(quick)
      !! Checks whether the matrix can be finalized with minimal copying.

      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
         !! matrix to check
      LOGICAL                                            :: quick
         !! whether matrix can be quickly finalized

!   ---------------------------------------------------------------------------

      IF (ASSOCIATED(matrix%wms)) THEN
         quick = matrix%nblks .EQ. 0
         quick = quick .AND. SIZE(matrix%wms) .EQ. 1 .AND. &
                 .NOT. dbcsr_wm_use_mutable(matrix%wms(1))
         IF (quick) THEN
            quick = quick .AND. &
                    dbcsr_memtype_equal( &
                    dbcsr_data_get_memory_type(matrix%wms(1)%data_area), &
                    dbcsr_data_get_memory_type(matrix%data_area))
            quick = quick .AND. &
                    ASSOCIATED(matrix%wms(1)%row_i)
            quick = quick .AND. &
                    (matrix%wms(1)%datasize_after_filtering .LT. 0 .OR. &
                     matrix%wms(1)%datasize .EQ. matrix%wms(1)%datasize_after_filtering)
         END IF
      ELSE
         quick = .FALSE.
      END IF
   END FUNCTION can_quickly_finalize

   SUBROUTINE quick_finalize(matrix)
      !! Performs quick finalization of matrix
      !! The data area from the work matrix is accepted as the new matrix's data
      !! area and the index is built from the work matrix.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! matrix to finalize

      CHARACTER(len=*), PARAMETER :: routineN = 'quick_finalize'
      INTEGER                                            :: error_handle, nblks, nrows

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, error_handle)
!$OMP     SECTIONS
!$OMP     SECTION
      nblks = matrix%wms(1)%lastblk
      nrows = matrix%nblkrows_total
      CALL dbcsr_sort_indices(nblks, &
                              matrix%wms(1)%row_i, &
                              matrix%wms(1)%col_i, &
                              matrix%wms(1)%blk_p)
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_row_p)
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_col_i)
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_blk_p)
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_row_p, &
                                   reservation=nrows + 1, extra=2*nblks)
      CALL dbcsr_make_dbcsr_index(matrix%row_p, matrix%wms(1)%row_i, &
                                  nrows, nblks)
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_col_i, &
                                   DATA=matrix%wms(1)%col_i(1:nblks))
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_blk_p, &
                                   DATA=matrix%wms(1)%blk_p(1:nblks))
      matrix%nblks = nblks
      matrix%nze = matrix%wms(1)%datasize
      matrix%index(dbcsr_slot_nblks) = nblks
      matrix%index(dbcsr_slot_nze) = matrix%wms(1)%datasize
      CALL dbcsr_repoint_index(matrix)
!$OMP     SECTION
      CALL dbcsr_switch_data_area(matrix, matrix%wms(1)%data_area)
!$OMP     END SECTIONS
      CALL timestop(error_handle)
   END SUBROUTINE quick_finalize

   SUBROUTINE dbcsr_add_wm_from_matrix(matrix, limits)
      !! Creates a work matrix from the data present in a finalized matrix.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! DBCSR matrix
      INTEGER, DIMENSION(4), INTENT(IN), OPTIONAL        :: limits
         !! the limits to use for copying

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_add_wm_from_matrix'

      INTEGER                                            :: handle, ithread, nthreads, nwms, &
                                                            old_nwms, size_used
      LOGICAL                                            :: preexists

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
!$OMP     BARRIER
      preexists = ASSOCIATED(matrix%wms)
      IF (preexists) THEN
         old_nwms = SIZE(matrix%wms)
         IF (old_nwms .EQ. 0) &
            DBCSR_WARN("Nonexisting work matrices?!")
      ELSE
         old_nwms = 0
      END IF
      nthreads = 1; ithread = 0
!$    nthreads = OMP_GET_NUM_THREADS(); ithread = OMP_GET_THREAD_NUM()
      IF (nthreads .GT. 1) THEN
         IF (old_nwms /= nthreads .AND. old_nwms /= 0) &
            DBCSR_ABORT("Number of work matrices and threads do not match")
      END IF
      nwms = MAX(1, old_nwms)
!$    nwms = MAX(nwms, nthreads)
!$OMP     BARRIER
!$OMP     MASTER
      IF (.NOT. ASSOCIATED(matrix%wms)) THEN
         CALL dbcsr_work_create(matrix, &
                                INT(matrix%nblks*default_resize_factor/nwms), &
                                INT(matrix%nze*default_resize_factor/nwms), &
                                n=nwms, work_mutable=.FALSE.)
      END IF
!$OMP     END MASTER
!$OMP     BARRIER
      size_used = matrix%nze
      CALL dbcsr_fill_wm_from_matrix(matrix%wms, matrix, size_used, &
                                     limits=limits)
!$OMP     BARRIER
      CALL timestop(handle)
   END SUBROUTINE dbcsr_add_wm_from_matrix

   SUBROUTINE dbcsr_fill_wm_from_matrix(wm, matrix, size_used, limits)
      !! Fills index and data of the work matrix from the
      !! previously-finalized one.
      !!
      !! limits
      !! The limits is a 4-tuple
      !! (lower_row, higher_row, lower_column, higher_column).

      TYPE(dbcsr_work_type), DIMENSION(:), INTENT(INOUT) :: wm
         !! the work matrix to fill
      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! DBCSR matrix
      INTEGER, INTENT(IN)                                :: size_used
      INTEGER, DIMENSION(4), INTENT(IN), OPTIONAL        :: limits
         !! only fills blocks within this range

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_fill_wm_from_matrix'

      INTEGER                                            :: blk, blk_p, col, handle, ithread, &
                                                            nthreads, nwms, nze, row, wblk_p, &
                                                            which_wm, wm_first, wm_last
      LOGICAL                                            :: careful, limit, mt, tr
      LOGICAL, SAVE                                      :: mutable
      TYPE(dbcsr_data_obj)                               :: data_block
      TYPE(dbcsr_iterator)                               :: iter

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      nwms = SIZE(matrix%wms)
      mt = .FALSE.
!$    IF (nwms .GT. 1) mt = omp_get_num_threads() .GT. 1
      ithread = 0; nthreads = 1
!$    ithread = omp_get_thread_num()
!$    nthreads = omp_get_num_threads()
      limit = PRESENT(limits)
      careful = size_used + size_used/8 &
                .LT. dbcsr_data_get_size_referenced(matrix%data_area)
      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(data_block, dbcsr_data_get_type(matrix%data_area))
      IF (mt) THEN
         wm_first = ithread + 1
         wm_last = ithread + 1
      ELSE
         wm_first = 1
         wm_last = nwms
      END IF
      ! Prepares the work matrices to accept the main data.
!$OMP     MASTER
      mutable = .FALSE.
      DO which_wm = 1, nwms
         mutable = mutable .OR. dbcsr_wm_use_mutable(wm(which_wm))
      END DO
!$OMP     END MASTER
!$OMP     BARRIER
      DO which_wm = wm_first, wm_last
         IF (dbcsr_wm_use_mutable(wm(which_wm))) &
            DBCSR_ABORT("Adding main matrix into mutable not supported.")
         IF (mutable) THEN
            IF (.NOT. dbcsr_mutable_instantiated(wm(which_wm)%mutable)) THEN
               CALL dbcsr_mutable_new(wm(which_wm)%mutable, matrix%data_type)
            END IF
         ELSE
            ! We don't know how much data we'll get so we have to be generous.
            CALL dbcsr_data_ensure_size(wm(which_wm)%data_area, &
                                        size_used/nwms)
            CALL dbcsr_data_set_size_referenced(wm(which_wm)%data_area, 0)
         END IF
      END DO
      ! Now copy the data
      CALL dbcsr_iterator_start(iter, matrix, shared=mt, &
                                contiguous_pointers=.FALSE., read_only=.TRUE.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_block, &
                                        transposed=tr, block_number=blk)
         IF (limit) THEN
            IF (.NOT. within_limits(row, col, limits)) CYCLE
         END IF
         blk_p = matrix%blk_p(blk)
         which_wm = ithread + 1
         wblk_p = SIGN(wm(which_wm)%datasize + 1, blk_p)
         nze = dbcsr_data_get_size(data_block)
         IF (mt .OR. limit .OR. careful .OR. mutable) THEN
            ! The data gets copied block by block so the block pointers
            ! are ordered accordingly.
            IF (.NOT. mutable) THEN
               CALL add_work_coordinate(wm(which_wm), row, col, wblk_p)
               CALL dbcsr_data_ensure_size(wm(which_wm)%data_area, &
                                           ABS(wblk_p) + nze - 1, factor=default_resize_factor)
               CALL dbcsr_data_set_size_referenced(wm(which_wm)%data_area, &
                                                   ABS(wblk_p) + nze - 1)
               CALL dbcsr_data_set(wm(which_wm)%data_area, &
                                   lb=ABS(wblk_p), &
                                   data_size=nze, &
                                   src=data_block, source_lb=1)
            END IF
         ELSE
            ! The data gets copied all at once so the block pointers
            ! should remain the same as they were.
            CALL add_work_coordinate(wm(which_wm), row, col, blk_p)
         END IF
         IF (.NOT. mutable) &
            wm(which_wm)%datasize = wm(which_wm)%datasize + nze
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_data_clear_pointer(data_block)
      CALL dbcsr_data_release(data_block)
      ! Copy all blocks at once
      IF (.NOT. mt .AND. .NOT. limit .AND. .NOT. careful .AND. .NOT. mutable) THEN
         DO which_wm = 1, nwms
            CALL dbcsr_data_ensure_size(wm(which_wm)%data_area, &
                                        dbcsr_data_get_size_referenced(matrix%data_area))
            CALL dbcsr_data_copyall(wm(which_wm)%data_area, matrix%data_area)
            wm(which_wm)%datasize = size_used
         END DO
      END IF
      CALL timestop(handle)
   END SUBROUTINE dbcsr_fill_wm_from_matrix

   PURE FUNCTION within_limits(row, column, limits)
      !! Checks whether a point is within bounds
      !! \return whether the point is within the bounds

      INTEGER, INTENT(IN)                                :: row, column
         !! point to check
         !! point to check
      INTEGER, DIMENSION(4), INTENT(IN)                  :: limits
         !! limits (low_row, high_row, low_col, high_col)
      LOGICAL                                            :: within_limits

      within_limits = row .GE. limits(1) .AND. row .LE. limits(2) .AND. &
                      column .GE. limits(3) .AND. column .LE. limits(4)
   END FUNCTION within_limits

   SUBROUTINE dbcsr_work_destroy(wm)
      !! Deallocates and destroys a work matrix.

      TYPE(dbcsr_work_type), INTENT(INOUT)               :: wm
         !! work matrix

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_work_destroy'

      INTEGER                                            :: handle

!   ---------------------------------------------------------------------------

      IF (careful_mod) CALL timeset(routineN, handle)

      IF (ASSOCIATED(wm%row_i)) THEN
         DEALLOCATE (wm%row_i)
      END IF
      IF (ASSOCIATED(wm%col_i)) THEN
         DEALLOCATE (wm%col_i)
      END IF
      IF (ASSOCIATED(wm%blk_p)) THEN
         DEALLOCATE (wm%blk_p)
      END IF
      CALL dbcsr_data_release(wm%data_area)
      CALL dbcsr_mutable_destroy(wm%mutable)

      IF (careful_mod) CALL timestop(handle)

   END SUBROUTINE dbcsr_work_destroy

   SUBROUTINE dbcsr_work_destroy_all(m)
      !! Deallocates and destroys a work matrix.
      TYPE(dbcsr_type), INTENT(INOUT)                    :: m

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_work_destroy_all'

      INTEGER                                            :: handle, i

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      IF (.NOT. ASSOCIATED(m%wms)) &
         DBCSR_WARN("Want to destroy nonexisting work matrices.")
      IF (ASSOCIATED(m%wms)) THEN
         DO i = 1, SIZE(m%wms)
            CALL dbcsr_work_destroy(m%wms(i))
         END DO
         DEALLOCATE (m%wms)
      END IF

      CALL timestop(handle)

   END SUBROUTINE dbcsr_work_destroy_all

   SUBROUTINE add_work_coordinate(matrix, row, col, blk, index)
      !! Adds a coordinate (or other data) into a work matrix's row_i and
      !! col_i arrays and returns its position.
      !! @note  Uses the matrix%lastblk to keep track of the current position.

      TYPE(dbcsr_work_type), INTENT(INOUT)               :: matrix
         !! work matrix
      INTEGER, INTENT(IN)                                :: row, col
         !! row data to add
         !! col data to add
      INTEGER, INTENT(IN), OPTIONAL                      :: blk
         !! block pointer to add
      INTEGER, INTENT(OUT), OPTIONAL                     :: index
         !! saved position

      CHARACTER(len=*), PARAMETER :: routineN = 'add_work_coordinate'

      INTEGER                                            :: handle

!   ---------------------------------------------------------------------------

      IF (careful_mod) CALL timeset(routineN, handle)
      matrix%lastblk = matrix%lastblk + 1
      CALL ensure_array_size(matrix%row_i, ub=matrix%lastblk, &
                             factor=default_resize_factor)
      CALL ensure_array_size(matrix%col_i, ub=matrix%lastblk, &
                             factor=default_resize_factor)
      matrix%row_i(matrix%lastblk) = row
      matrix%col_i(matrix%lastblk) = col
      IF (PRESENT(blk)) THEN
         CALL ensure_array_size(matrix%blk_p, ub=matrix%lastblk, &
                                factor=default_resize_factor)
         matrix%blk_p(matrix%lastblk) = blk
      END IF
      IF (PRESENT(index)) index = matrix%lastblk
      IF (careful_mod) CALL timestop(handle)
   END SUBROUTINE add_work_coordinate

   SUBROUTINE dbcsr_merge_all(matrix, old_row_p, old_col_i, old_blk_p, &
                              sort_data)
      !! Merge data from matrix and work matrices into the final matrix.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! matrix to work on
      INTEGER, DIMENSION(*), INTENT(IN)                  :: old_row_p, old_col_i, old_blk_p
      LOGICAL, INTENT(IN)                                :: sort_data
         !! whether data will be fully sorted

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_merge_all'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, my_row_count, nblks, &
                                                            nrows, nwms, row, t, &
                                                            which_wm, wm_datasize
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: all_data_offsets, all_data_sizes, &
                                                            new_blk_p_sorted, new_blk_sizes, &
                                                            new_row_p
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE, TARGET   :: blk_d, new_blk_p, new_col_i
      INTEGER, DIMENSION(:), CONTIGUOUS, POINTER         :: my_row_p
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS, SAVE   :: cbs, rbs
      INTEGER, SAVE                                      :: max_row_count, new_nblks = 0, new_nze = 0
      TYPE(dbcsr_data_obj), ALLOCATABLE, DIMENSION(:), &
         SAVE                                            :: all_data_areas
      TYPE(dbcsr_work_type), POINTER                     :: wm
      TYPE(i_array_p), DIMENSION(:), POINTER, SAVE       :: all_blk_p, all_col_i, all_row_p

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      ! Outline:
      ! Each thread has a work matrix.  These must be merged and made
      ! into a new index.  If sort_data is False, then the data areas
      ! are simply appended.  This is probably quicker but the data
      ! blocks are not in order and the data size may expand beyond
      ! what is needed.  If sort_data is True, then data blocks are
      ! sorted in order.

      IF (dbg) WRITE (*, *) routineN//" starting, sort?", sort_data
!$OMP     BARRIER
      nrows = matrix%nblkrows_total
      nwms = SIZE(matrix%wms)
!$    IF (nwms /= OMP_GET_NUM_THREADS()) &
!$       DBCSR_ABORT("Number of threads does not match number of work matrices.")
      which_wm = 1
!$    which_wm = OMP_GET_THREAD_NUM() + 1
      wm => matrix%wms(which_wm)
      !
      ! Currently B-tree-based work matrices are converted to array form.
      IF (dbcsr_wm_use_mutable(wm)) THEN
         SELECT CASE (wm%mutable%m%data_type)
         CASE (dbcsr_type_real_4)
            CALL tree_to_linear_s(wm)
         CASE (dbcsr_type_real_8)
            CALL tree_to_linear_d(wm)
         CASE (dbcsr_type_complex_4)
            CALL tree_to_linear_c(wm)
         CASE (dbcsr_type_complex_8)
            CALL tree_to_linear_z(wm)
         CASE default
            DBCSR_ABORT("Invalid data type")
         END SELECT
      END IF
      !
      ! Initializations and some counts from the threads are summed.
      !
!$OMP     MASTER
      new_nblks = old_row_p(nrows + 1)
      new_nze = matrix%nze
      ALLOCATE (all_row_p(nwms))
      ALLOCATE (all_col_i(nwms))
      ALLOCATE (all_blk_p(nwms))
      ALLOCATE (all_data_sizes(0:nwms))
      ALLOCATE (all_data_offsets(nwms))
      IF (sort_data) THEN
         ALLOCATE (all_data_areas(0:nwms))
         CALL dbcsr_data_init(all_data_areas(0))
         all_data_areas(0) = matrix%data_area
!$OMP        CRITICAL (crit_data)
         CALL dbcsr_data_hold(all_data_areas(0))
!$OMP        END CRITICAL (crit_data)
      END IF
      ! The following is valid because the data actually referenced
      ! by blocks is explicitly calculated in dbcsr_finalize.
      all_data_sizes(0) = matrix%nze
      rbs => array_data(matrix%row_blk_size)
      cbs => array_data(matrix%col_blk_size)
!$OMP     END MASTER
      !
!$OMP     BARRIER
      IF (sort_data .AND. wm%datasize_after_filtering .GE. 0) THEN
         wm_datasize = wm%datasize_after_filtering
      ELSE
         wm_datasize = wm%datasize
      END IF
      nblks = wm%lastblk
!$OMP     ATOMIC
      new_nblks = new_nblks + nblks
!$OMP     ATOMIC
      new_nze = new_nze + wm_datasize
!$OMP     BARRIER
      !
!$OMP     MASTER
      ALLOCATE (new_row_p(nrows + 1))
      ALLOCATE (new_col_i(new_nblks))
      ALLOCATE (new_blk_p(new_nblks))
      IF (sort_data) THEN
         ALLOCATE (blk_d(new_nblks))
      ELSE
         ALLOCATE (blk_d(1))
      END IF
!$OMP     END MASTER
      !
      ! Each thread creates a row_p index for its new blocks.  This gives the
      ! number of new blocks per thread per row.
      CALL dbcsr_sort_indices(nblks, wm%row_i, wm%col_i, wm%blk_p)
      ALLOCATE (my_row_p(nrows + 1))
      CALL dbcsr_make_dbcsr_index(my_row_p, wm%row_i, nrows, nblks)
      !
      ! The threads must share their row_p, col_i, blk_p, and data areas
      ! among themselves.
      all_row_p(which_wm)%p => my_row_p
      all_data_sizes(which_wm) = wm_datasize
      IF (.NOT. sort_data) THEN
         IF (wm_datasize > dbcsr_data_get_size_referenced(wm%data_area)) &
            DBCSR_ABORT("Data size mismatch.")
      END IF
      all_col_i(which_wm)%p => wm%col_i
      all_blk_p(which_wm)%p => wm%blk_p
!$OMP     BARRIER
      IF (sort_data) THEN
!$OMP        MASTER
         all_data_offsets(:) = 0
!$OMP        END MASTER
         CALL dbcsr_data_init(all_data_areas(which_wm))
         all_data_areas(which_wm) = wm%data_area
!$OMP        CRITICAL (crit_data)
         CALL dbcsr_data_hold(all_data_areas(which_wm))
!$OMP        END CRITICAL (crit_data)
      ELSE
!$OMP        MASTER
         all_data_offsets(1) = all_data_sizes(0)
         DO t = 2, nwms
            all_data_offsets(t) = all_data_offsets(t - 1) + all_data_sizes(t - 1)
         END DO
!$OMP        END MASTER
      END IF
      !
      ! The new row counts are created, then the new row_p index is created.
      !
!$OMP     DO
      DO row = 1, nrows
         my_row_count = old_row_p(row + 1) - old_row_p(row)
         DO t = 1, nwms
            my_row_count = my_row_count &
                 &       + all_row_p(t)%p(row + 1) - all_row_p(t)%p(row)
         END DO
         new_row_p(row) = my_row_count
      END DO
!$OMP     END DO
!$OMP     MASTER
      max_row_count = MAXVAL(new_row_p(1:nrows))
      CALL dbcsr_build_row_index(new_row_p, nrows)
!$OMP     END MASTER
!$OMP     BARRIER
      !
      ! The new index is built
      CALL merge_index(new_row_p, new_col_i, new_blk_p, &
                       blk_d, &
                       old_row_p, old_col_i, old_blk_p, &
                       all_row_p, all_col_i, all_blk_p, &
                       all_data_offsets, nwms, nrows, max_row_count, sort_data)
      !
      ! The data is then merged in one of two ways.
      !
!$OMP     MASTER
      IF (sort_data) THEN
         ! The matrix gets a fresh data area.  Blocks from the work
         ! matrices will be copied into it in order.
         !
!$OMP        CRITICAL (crit_data)
         CALL dbcsr_data_release(matrix%data_area)
         CALL dbcsr_data_init(matrix%data_area)
!$OMP        END CRITICAL (crit_data)
         CALL dbcsr_data_new(matrix%data_area, &
                             data_size=new_nze, &
                             data_type=dbcsr_data_get_type(all_data_areas(0)), &
                             memory_type=dbcsr_data_get_memory_type(all_data_areas(0)))
         ALLOCATE (new_blk_p_sorted(new_nblks))
         ALLOCATE (new_blk_sizes(new_nblks))
      ELSE
         ! Data stored in the work matrices will be just appended to the
         ! current data area.
         CALL dbcsr_data_ensure_size(matrix%data_area, new_nze)
      END IF
!$OMP     END MASTER
!$OMP     BARRIER
      IF (sort_data) THEN
         CALL dbcsr_calc_block_sizes(new_blk_sizes, &
                                     new_row_p, new_col_i, rbs, cbs)
         CALL dbcsr_sort_data(new_blk_p_sorted, new_blk_p, &
                              new_blk_sizes, dsts=matrix%data_area, &
                              src=all_data_areas(0), &
                              srcs=all_data_areas, old_blk_d=blk_d)
      ELSE
         IF (all_data_sizes(which_wm) .GT. 0) THEN
            CALL dbcsr_data_copy(dst=matrix%data_area, &
                                 dst_lb=(/all_data_offsets(which_wm) + 1/), &
                                 dst_sizes=(/all_data_sizes(which_wm)/), &
                                 src=wm%data_area, &
                                 src_lb=(/1/), &
                                 src_sizes=(/all_data_sizes(which_wm)/))
         END IF
      END IF
      !
      ! Creates a new index array.
      !
!$OMP     BARRIER
!$OMP     MASTER
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_row_p)
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_col_i)
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_blk_p)
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_row_p, &
                                   DATA=new_row_p(1:nrows + 1), extra=new_nblks*2)
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_col_i, &
                                   DATA=new_col_i(1:new_nblks))
      IF (sort_data) THEN
         CALL dbcsr_addto_index_array(matrix, dbcsr_slot_blk_p, &
                                      DATA=new_blk_p_sorted)
      ELSE
         CALL dbcsr_addto_index_array(matrix, dbcsr_slot_blk_p, &
                                      DATA=new_blk_p)
      END IF
      matrix%nblks = new_nblks
      matrix%nze = new_nze
      matrix%index(dbcsr_slot_nblks) = matrix%nblks
      matrix%index(dbcsr_slot_nze) = matrix%nze
      CALL dbcsr_repoint_index(matrix)
!$OMP     END MASTER
      !
!$OMP     BARRIER
      !
      DEALLOCATE (my_row_p)
      IF (sort_data) THEN
!$OMP        MASTER
!$OMP        CRITICAL (crit_data)
         CALL dbcsr_data_release(all_data_areas(0))
!$OMP        END CRITICAL (crit_data)
         DO which_wm = 1, nwms
!$OMP           CRITICAL (crit_data)
            CALL dbcsr_data_release(all_data_areas(which_wm))
!$OMP           END CRITICAL (crit_data)
         END DO
!$OMP        END MASTER
      END IF
!$OMP     BARRIER
!$OMP     MASTER
      DEALLOCATE (all_row_p)
      DEALLOCATE (all_col_i)
      DEALLOCATE (all_blk_p)
      DEALLOCATE (new_row_p)
      DEALLOCATE (new_col_i)
      DEALLOCATE (new_blk_p)
      DEALLOCATE (all_data_sizes)
      DEALLOCATE (all_data_offsets)
      IF (sort_data) THEN
         DEALLOCATE (all_data_areas)
         DEALLOCATE (new_blk_sizes)
         DEALLOCATE (new_blk_p_sorted)
      END IF
      DEALLOCATE (blk_d)
!$OMP     END MASTER
!$OMP     BARRIER
      IF (dbg) WRITE (*, *) routineN//" stopped"
      CALL timestop(handle)
   END SUBROUTINE dbcsr_merge_all

   SUBROUTINE dbcsr_merge_single_wm(matrix)
      !! Sort data from the WM into the final matrix, based closely on dbcsr_merge_all

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! matrix to work on

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_merge_single_wm'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, nblks, nrows
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE           :: new_blk_p_sorted, new_blk_sizes, &
                                                            new_row_p
      INTEGER, ALLOCATABLE, DIMENSION(:), SAVE, TARGET   :: blk_d
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS, SAVE   :: cbs, rbs
      TYPE(dbcsr_work_type), POINTER                     :: wm

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      ! Outline:
      ! There is a single work matrix.  Data blocks are sorted and copied
      ! into the matrix data area (which is empty).  The index is made consistent

      nrows = matrix%nblkrows_total
      wm => matrix%wms(1)
      nblks = wm%lastblk
      IF (dbcsr_wm_use_mutable(wm)) &
         DBCSR_ABORT("Number of threads does not match number of work matrices.")
      !
      ! The following is valid because the data actually referenced
      ! by blocks is explicitly calculated in dbcsr_finalize.
!$OMP     MASTER
      rbs => array_data(matrix%row_blk_size)
      cbs => array_data(matrix%col_blk_size)
      !
      ! Initializations
      !
      ALLOCATE (new_row_p(nrows + 1))
      ALLOCATE (blk_d(nblks))
      ALLOCATE (new_blk_p_sorted(nblks))
      ALLOCATE (new_blk_sizes(nblks))
      !
      ! Master thread creates a row_p index for the (sorted) blocks.
      CALL dbcsr_sort_indices(nblks, wm%row_i, wm%col_i, wm%blk_p)
      CALL dbcsr_make_dbcsr_index(new_row_p, wm%row_i, nrows, nblks)
      !
      !
      ! The matrix data area is resized.  Blocks from the work
      ! matrices will be copied into it in order.
      !
      CALL dbcsr_data_ensure_size(matrix%data_area, wm%datasize, nocopy=.TRUE.)
!$OMP     END MASTER
!$OMP     BARRIER
      CALL dbcsr_calc_block_sizes(new_blk_sizes, &
                                  new_row_p, wm%col_i, rbs, cbs)
      CALL dbcsr_sort_data(new_blk_p_sorted, wm%blk_p, &
                           new_blk_sizes, dsts=matrix%data_area, &
                           src=wm%data_area)
      !
      ! Creates a new index array.
      !
!$OMP     BARRIER
!$OMP     MASTER
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_row_p)
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_col_i)
      CALL dbcsr_clearfrom_index_array(matrix, dbcsr_slot_blk_p)
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_row_p, &
                                   DATA=new_row_p(1:nrows + 1), extra=nblks*2)
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_col_i, &
                                   DATA=wm%col_i(1:nblks))
      CALL dbcsr_addto_index_array(matrix, dbcsr_slot_blk_p, &
                                   DATA=new_blk_p_sorted)
      matrix%nblks = nblks
      matrix%nze = wm%datasize
      matrix%index(dbcsr_slot_nblks) = matrix%nblks
      matrix%index(dbcsr_slot_nze) = matrix%nze
      CALL dbcsr_repoint_index(matrix)
      DEALLOCATE (new_row_p)
      DEALLOCATE (new_blk_sizes)
      DEALLOCATE (new_blk_p_sorted)
      DEALLOCATE (blk_d)
!$OMP     END MASTER
      IF (dbg) WRITE (*, *) routineN//" stopped"
      CALL timestop(handle)
   END SUBROUTINE dbcsr_merge_single_wm

   SUBROUTINE merge_index(new_row_p, new_col_i, new_blk_p, &
      !! Builds a new index from several work matrices.
                          blk_d, old_row_p, old_col_i, old_blk_p, &
                          all_row_p, all_col_i, all_blk_p, &
                          all_data_offsets, nwms, nrows, max_row_count, sort_data)
      INTEGER, DIMENSION(*), INTENT(IN)                  :: new_row_p
      INTEGER, DIMENSION(*), INTENT(OUT), TARGET         :: new_col_i, new_blk_p
      INTEGER, DIMENSION(*), INTENT(IN), TARGET          :: blk_d
      INTEGER, DIMENSION(*), INTENT(IN)                  :: old_row_p, old_col_i, old_blk_p
      TYPE(i_array_p), DIMENSION(*), INTENT(IN)          :: all_row_p, all_col_i, all_blk_p
      INTEGER, DIMENSION(*), INTENT(IN)                  :: all_data_offsets
      INTEGER, INTENT(IN)                                :: nwms, nrows, max_row_count
      LOGICAL, INTENT(IN)                                :: sort_data

      CHARACTER(len=*), PARAMETER :: routineN = 'merge_index'
      INTEGER                                            :: blk1, blk2, error_handle, first_row_blk, &
                                                            last_row_blk, row, src_blk_1, &
                                                            src_blk_2, t
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: blk_p_buff, tmp_arr
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: blk_span, col_span, d_span

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, error_handle)
      !
      ALLOCATE (tmp_arr(max_row_count))
      ALLOCATE (blk_p_buff(max_row_count))
      !
!$OMP     DO
      DO row = 1, nrows
         first_row_blk = new_row_p(row) + 1
         last_row_blk = new_row_p(row + 1)
         col_span => new_col_i(first_row_blk:last_row_blk)
         blk_span => new_blk_p(first_row_blk:last_row_blk)
         IF (sort_data) d_span => blk_d(first_row_blk:last_row_blk)
         src_blk_1 = old_row_p(row) + 1
         src_blk_2 = old_row_p(row + 1)
         blk1 = 1
         blk2 = blk1 + (src_blk_2 - src_blk_1 + 1) - 1
         col_span(blk1:blk2) = old_col_i(src_blk_1:src_blk_2)
         blk_span(blk1:blk2) = old_blk_p(src_blk_1:src_blk_2)
         IF (sort_data) THEN
            d_span(blk1:blk2) = 1
            DO t = 1, nwms
               src_blk_1 = all_row_p(t)%p(row) + 1
               src_blk_2 = all_row_p(t)%p(row + 1)
               blk1 = blk2 + 1
               blk2 = blk1 + (src_blk_2 - src_blk_1 + 1) - 1
               col_span(blk1:blk2) = all_col_i(t)%p(src_blk_1:src_blk_2)
               blk_span(blk1:blk2) = all_blk_p(t)%p(src_blk_1:src_blk_2)
               d_span(blk1:blk2) = t + 1
            END DO
         ELSE
            DO t = 1, nwms
               src_blk_1 = all_row_p(t)%p(row) + 1
               src_blk_2 = all_row_p(t)%p(row + 1)
               blk1 = blk2 + 1
               blk2 = blk1 + (src_blk_2 - src_blk_1 + 1) - 1
               col_span(blk1:blk2) = all_col_i(t)%p(src_blk_1:src_blk_2)
               blk_span(blk1:blk2) = all_blk_p(t)%p(src_blk_1:src_blk_2) &
                    &              + all_data_offsets(t)
            END DO
         END IF
         CALL sort(col_span, SIZE(col_span), tmp_arr)
         blk_p_buff(1:SIZE(blk_span)) = blk_span(:)
         blk_span(1:SIZE(blk_span)) = blk_p_buff(tmp_arr(1:SIZE(blk_span)))
      END DO
!$OMP     END DO
      !
      DEALLOCATE (tmp_arr)
      DEALLOCATE (blk_p_buff)
      !
      CALL timestop(error_handle)
   END SUBROUTINE merge_index

   #:include '../data/dbcsr.fypp'
   #:for n, nametype1, base1, prec1, kind1, type1, dkind1 in inst_params_float
      SUBROUTINE tree_to_linear_${nametype1}$ (wm)
     !! Converts mutable data to linear (array) type.

         USE dbcsr_btree, &
            ONLY: btree_2d_data_${nametype1}$ => btree_data_${nametype1}$p2d, &
                  btree_destroy_${nametype1}$ => btree_delete, &
                  btree_size_${nametype1}$ => btree_get_entries
         TYPE(dbcsr_work_type), INTENT(INOUT)     :: wm
        !! work matrix to convert

         CHARACTER(len=*), PARAMETER :: routineN = 'tree_to_linear_${nametype1}$'

         INTEGER                                  :: blk, blk_p, treesize, &
                                                     error_handler, needed_size
         INTEGER(KIND=int_8), ALLOCATABLE, &
            DIMENSION(:)                           :: keys
         ${type1}$, DIMENSION(:), POINTER, CONTIGUOUS :: target_data
         ${type1}$, DIMENSION(:, :), POINTER        :: block_2d
         TYPE(btree_2d_data_${nametype1}$), ALLOCATABLE, &
            DIMENSION(:)                           :: values

!   ---------------------------------------------------------------------------

         CALL timeset(routineN, error_handler)
         ! srt = .TRUE. ! Not needed because of the copy
         treesize = btree_size_${nametype1}$ (wm%mutable%m%btree_${nametype1}$)
         IF (wm%lastblk .NE. treesize) &
            DBCSR_ABORT("Mismatch in number of blocks")
         ALLOCATE (keys(treesize), values(treesize))
         CALL btree_destroy_${nametype1}$ (wm%mutable%m%btree_${nametype1}$, keys, values)
         CALL ensure_array_size(wm%row_i, ub=treesize)
         CALL ensure_array_size(wm%col_i, ub=treesize)
         CALL dbcsr_unpack_i8_2i4(keys, wm%row_i, &
                                  wm%col_i)
         ! For now we also fill the data, sloooowly, but this should
         ! be avoided and the data should be copied directly from the
         ! source in the subroutine's main loop.
         CALL ensure_array_size(wm%blk_p, ub=treesize)
         needed_size = 0
         DO blk = 1, treesize
            block_2d => values(blk)%p
            needed_size = needed_size + SIZE(block_2d)
         END DO
         wm%datasize = needed_size
         CALL dbcsr_data_ensure_size(wm%data_area, &
                                     wm%datasize)
         target_data => dbcsr_get_data_p_${nametype1}$ (wm%data_area)
         blk_p = 1
         DO blk = 1, treesize
            block_2d => values(blk)%p
            IF (.NOT. values(blk)%tr) THEN
               wm%blk_p(blk) = blk_p
            ELSE
               wm%blk_p(blk) = -blk_p
            END IF
            CALL block_copy_${nametype1}$ (target_data, block_2d, &
                                           SIZE(block_2d), blk_p, 1)
            blk_p = blk_p + SIZE(block_2d)
            DEALLOCATE (block_2d)
         END DO
         DEALLOCATE (keys, values)
         CALL dbcsr_mutable_release(wm%mutable)
         CALL timestop(error_handler)
      END SUBROUTINE tree_to_linear_${nametype1}$

   #:endfor

END MODULE dbcsr_work_operations
